Lab6

Author

Sierra Mattiar

Published

April 4, 2025

Load Packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.7     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.1     ✔ yardstick    1.3.2
✔ recipes      1.1.1     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(readr)
library(powerjoin)
library(skimr)
library(visdat)
library(ggpubr)
library(ggcorrplot)
library(workflowsets)
library(recipes)
# List all .txt files in the data folder
data_files <- list.files("data", pattern = "\\.txt$", full.names = TRUE)

# Read each file using read_delim and store in a list
camels_list <- map(data_files, read_delim, delim = ";")
Rows: 671 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (3): gauge_id, high_prec_timing, low_prec_timing
dbl (9): p_mean, pet_mean, p_seasonality, frac_snow, aridity, high_prec_freq...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 671 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (3): gauge_id, geol_1st_class, geol_2nd_class
dbl (5): glim_1st_class_frac, glim_2nd_class_frac, carbonate_rocks_frac, geo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 671 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr  (1): gauge_id
dbl (13): q_mean, runoff_ratio, slope_fdc, baseflow_index, stream_elas, q5, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 671 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr  (1): gauge_id
dbl (11): soil_depth_pelletier, soil_depth_statsgo, soil_porosity, soil_cond...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 671 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (1): gauge_id
dbl (6): gauge_lat, gauge_lon, elev_mean, slope_mean, area_gages2, area_geos...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 671 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ";"
chr (2): gauge_id, dom_land_cover
dbl (8): frac_forest, lai_max, lai_diff, gvf_max, gvf_diff, dom_land_cover_f...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Join all data frames using power_full_join by gauge_id
camels_data <- reduce(camels_list, power_full_join, by = "gauge_id")
# Look at structure of the combined data
glimpse(camels_data)
Rows: 671
Columns: 58
$ gauge_id             <chr> "01013500", "01022500", "01030500", "01031500", "…
$ p_mean               <dbl> 3.126679, 3.608126, 3.274405, 3.522957, 3.323146,…
$ pet_mean             <dbl> 1.971555, 2.119256, 2.043594, 2.071324, 2.090024,…
$ p_seasonality        <dbl> 0.187940259, -0.114529586, 0.047358189, 0.1040905…
$ frac_snow            <dbl> 0.3134404, 0.2452590, 0.2770184, 0.2918365, 0.280…
$ aridity              <dbl> 0.6305587, 0.5873564, 0.6241114, 0.5879503, 0.628…
$ high_prec_freq       <dbl> 12.95, 20.55, 17.15, 18.90, 20.10, 13.50, 17.50, …
$ high_prec_dur        <dbl> 1.348958, 1.205279, 1.207746, 1.148936, 1.165217,…
$ high_prec_timing     <chr> "son", "son", "son", "son", "son", "jja", "son", …
$ low_prec_freq        <dbl> 202.20, 233.65, 215.60, 227.35, 235.90, 193.50, 2…
$ low_prec_dur         <dbl> 3.427119, 3.662226, 3.514262, 3.473644, 3.691706,…
$ low_prec_timing      <chr> "mam", "jja", "djf", "djf", "djf", "mam", "mam", …
$ geol_1st_class       <chr> "Siliciclastic sedimentary rocks", "Acid plutonic…
$ glim_1st_class_frac  <dbl> 0.8159044, 0.5906582, 0.5733054, 0.4489279, 0.308…
$ geol_2nd_class       <chr> "Basic volcanic rocks", "Siliciclastic sedimentar…
$ glim_2nd_class_frac  <dbl> 0.17972945, 0.16461821, 0.28701001, 0.44386282, 0…
$ carbonate_rocks_frac <dbl> 0.000000000, 0.000000000, 0.052140094, 0.02625797…
$ geol_porostiy        <dbl> 0.1714, 0.0710, 0.1178, 0.0747, 0.0522, 0.0711, 0…
$ geol_permeability    <dbl> -14.7019, -14.2138, -14.4918, -14.8410, -14.4819,…
$ q_mean               <dbl> 1.699155, 2.173062, 1.820108, 2.030242, 2.182870,…
$ runoff_ratio         <dbl> 0.5434375, 0.6022689, 0.5558590, 0.5762893, 0.656…
$ slope_fdc            <dbl> 1.528219, 1.776280, 1.871110, 1.494019, 1.415939,…
$ baseflow_index       <dbl> 0.5852260, 0.5544784, 0.5084407, 0.4450905, 0.473…
$ stream_elas          <dbl> 1.8453242, 1.7027824, 1.3775052, 1.6486930, 1.510…
$ q5                   <dbl> 0.24110613, 0.20473436, 0.10714920, 0.11134535, 0…
$ q95                  <dbl> 6.373021, 7.123049, 6.854887, 8.010503, 8.095148,…
$ high_q_freq          <dbl> 6.10, 3.90, 12.25, 18.90, 14.95, 14.10, 16.05, 16…
$ high_q_dur           <dbl> 8.714286, 2.294118, 7.205882, 3.286957, 2.577586,…
$ low_q_freq           <dbl> 41.35, 65.15, 89.25, 94.80, 71.55, 58.90, 82.20, …
$ low_q_dur            <dbl> 20.170732, 17.144737, 19.402174, 14.697674, 12.77…
$ zero_q_freq          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0…
$ hfd_mean             <dbl> 207.25, 166.25, 184.90, 181.00, 184.80, 197.20, 1…
$ soil_depth_pelletier <dbl> 7.4047619, 17.4128079, 19.0114144, 7.2525570, 5.3…
$ soil_depth_statsgo   <dbl> 1.248408, 1.491846, 1.461363, 1.279047, 1.392779,…
$ soil_porosity        <dbl> 0.4611488, 0.4159055, 0.4590910, 0.4502360, 0.422…
$ soil_conductivity    <dbl> 1.106522, 2.375005, 1.289807, 1.373292, 2.615154,…
$ max_water_content    <dbl> 0.5580548, 0.6262289, 0.6530198, 0.5591227, 0.561…
$ sand_frac            <dbl> 27.84183, 59.39016, 32.23546, 35.26903, 55.16313,…
$ silt_frac            <dbl> 55.15694, 28.08094, 51.77918, 50.84123, 34.18544,…
$ clay_frac            <dbl> 16.275732, 12.037646, 14.776824, 12.654125, 10.30…
$ water_frac           <dbl> 5.3766978, 1.2269127, 1.6343449, 0.6745936, 0.000…
$ organic_frac         <dbl> 0.4087168, 0.0000000, 1.3302776, 0.0000000, 0.000…
$ other_frac           <dbl> 0.0000000, 0.3584723, 0.0220161, 0.0000000, 0.147…
$ gauge_lat            <dbl> 47.23739, 44.60797, 45.50097, 45.17501, 44.86920,…
$ gauge_lon            <dbl> -68.58264, -67.93524, -68.30596, -69.31470, -69.9…
$ elev_mean            <dbl> 250.31, 92.68, 143.80, 247.80, 310.38, 615.70, 47…
$ slope_mean           <dbl> 21.64152, 17.79072, 12.79195, 29.56035, 49.92122,…
$ area_gages2          <dbl> 2252.70, 573.60, 3676.17, 769.05, 909.10, 383.82,…
$ area_geospa_fabric   <dbl> 2303.95, 620.38, 3676.09, 766.53, 904.94, 396.10,…
$ frac_forest          <dbl> 0.9063, 0.9232, 0.8782, 0.9548, 0.9906, 1.0000, 1…
$ lai_max              <dbl> 4.167304, 4.871392, 4.685200, 4.903259, 5.086811,…
$ lai_diff             <dbl> 3.340732, 3.746692, 3.665543, 3.990843, 4.300978,…
$ gvf_max              <dbl> 0.8045674, 0.8639358, 0.8585020, 0.8706685, 0.891…
$ gvf_diff             <dbl> 0.3716482, 0.3377125, 0.3513934, 0.3986194, 0.445…
$ dom_land_cover_frac  <dbl> 0.8834519, 0.8204934, 0.9752580, 1.0000000, 0.850…
$ dom_land_cover       <chr> "    Mixed Forests", "    Mixed Forests", "    Mi…
$ root_depth_50        <dbl> NA, 0.2374345, NA, 0.2500000, 0.2410270, 0.225615…
$ root_depth_99        <dbl> NA, 2.238444, NA, 2.400000, 2.340180, 2.237435, 2…
# Summarize the dataset
skim(camels_data)
Data summary
Name camels_data
Number of rows 671
Number of columns 58
_______________________
Column type frequency:
character 6
numeric 52
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gauge_id 0 1.00 8 8 0 671 0
high_prec_timing 0 1.00 3 3 0 4 0
low_prec_timing 0 1.00 3 3 0 4 0
geol_1st_class 0 1.00 12 31 0 12 0
geol_2nd_class 138 0.79 12 31 0 13 0
dom_land_cover 0 1.00 12 38 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
p_mean 0 1.00 3.26 1.41 0.64 2.37 3.23 3.78 8.94 ▃▇▂▁▁
pet_mean 0 1.00 2.79 0.55 1.90 2.34 2.69 3.15 4.74 ▇▇▅▂▁
p_seasonality 0 1.00 -0.04 0.53 -1.44 -0.26 0.08 0.22 0.92 ▁▂▃▇▂
frac_snow 0 1.00 0.18 0.20 0.00 0.04 0.10 0.22 0.91 ▇▂▁▁▁
aridity 0 1.00 1.06 0.62 0.22 0.70 0.86 1.27 5.21 ▇▂▁▁▁
high_prec_freq 0 1.00 20.93 4.55 7.90 18.50 22.00 24.23 32.70 ▂▃▇▇▁
high_prec_dur 0 1.00 1.35 0.19 1.08 1.21 1.28 1.44 2.09 ▇▅▂▁▁
low_prec_freq 0 1.00 254.65 35.12 169.90 232.70 255.85 278.92 348.70 ▂▅▇▅▁
low_prec_dur 0 1.00 5.95 3.20 2.79 4.24 4.95 6.70 36.51 ▇▁▁▁▁
glim_1st_class_frac 0 1.00 0.79 0.20 0.30 0.61 0.83 1.00 1.00 ▁▃▃▃▇
glim_2nd_class_frac 0 1.00 0.16 0.14 0.00 0.00 0.14 0.27 0.49 ▇▃▃▂▁
carbonate_rocks_frac 0 1.00 0.12 0.26 0.00 0.00 0.00 0.04 1.00 ▇▁▁▁▁
geol_porostiy 3 1.00 0.13 0.07 0.01 0.07 0.13 0.19 0.28 ▇▆▇▇▂
geol_permeability 0 1.00 -13.89 1.18 -16.50 -14.77 -13.96 -13.00 -10.90 ▂▅▇▅▂
q_mean 1 1.00 1.49 1.54 0.00 0.63 1.13 1.75 9.69 ▇▁▁▁▁
runoff_ratio 1 1.00 0.39 0.23 0.00 0.24 0.35 0.51 1.36 ▆▇▂▁▁
slope_fdc 1 1.00 1.24 0.51 0.00 0.90 1.28 1.63 2.50 ▂▅▇▇▁
baseflow_index 0 1.00 0.49 0.16 0.01 0.40 0.50 0.60 0.98 ▁▃▇▅▁
stream_elas 1 1.00 1.83 0.78 -0.64 1.32 1.70 2.23 6.24 ▁▇▃▁▁
q5 1 1.00 0.17 0.27 0.00 0.01 0.08 0.22 2.42 ▇▁▁▁▁
q95 1 1.00 5.06 4.94 0.00 2.07 3.77 6.29 31.82 ▇▂▁▁▁
high_q_freq 1 1.00 25.74 29.07 0.00 6.41 15.10 35.79 172.80 ▇▂▁▁▁
high_q_dur 1 1.00 6.91 10.07 0.00 1.82 2.85 7.55 92.56 ▇▁▁▁▁
low_q_freq 1 1.00 107.62 82.24 0.00 37.44 96.00 162.14 356.80 ▇▆▅▂▁
low_q_dur 1 1.00 22.28 21.66 0.00 10.00 15.52 26.91 209.88 ▇▁▁▁▁
zero_q_freq 1 1.00 0.03 0.11 0.00 0.00 0.00 0.00 0.97 ▇▁▁▁▁
hfd_mean 1 1.00 182.52 33.53 112.25 160.16 173.77 204.05 287.75 ▂▇▃▂▁
soil_depth_pelletier 0 1.00 10.87 16.24 0.27 1.00 1.23 12.89 50.00 ▇▁▁▁▁
soil_depth_statsgo 0 1.00 1.29 0.27 0.40 1.11 1.46 1.50 1.50 ▁▁▂▂▇
soil_porosity 0 1.00 0.44 0.02 0.37 0.43 0.44 0.46 0.68 ▃▇▁▁▁
soil_conductivity 0 1.00 1.74 1.52 0.45 0.93 1.35 1.93 13.96 ▇▁▁▁▁
max_water_content 0 1.00 0.53 0.15 0.09 0.43 0.56 0.64 1.05 ▁▅▇▃▁
sand_frac 0 1.00 36.47 15.63 8.18 25.44 35.27 44.46 91.98 ▅▇▅▁▁
silt_frac 0 1.00 33.86 13.25 2.99 23.95 34.06 43.64 67.77 ▂▆▇▆▁
clay_frac 0 1.00 19.89 9.32 1.85 14.00 18.66 25.42 50.35 ▃▇▅▂▁
water_frac 0 1.00 0.10 0.94 0.00 0.00 0.00 0.00 19.35 ▇▁▁▁▁
organic_frac 0 1.00 0.59 3.84 0.00 0.00 0.00 0.00 57.86 ▇▁▁▁▁
other_frac 0 1.00 9.82 16.83 0.00 0.00 1.31 11.74 99.38 ▇▁▁▁▁
gauge_lat 0 1.00 39.24 5.21 27.05 35.70 39.25 43.21 48.82 ▂▃▇▆▅
gauge_lon 0 1.00 -95.79 16.21 -124.39 -110.41 -92.78 -81.77 -67.94 ▆▃▇▇▅
elev_mean 0 1.00 759.42 786.00 10.21 249.67 462.72 928.88 3571.18 ▇▂▁▁▁
slope_mean 0 1.00 46.20 47.12 0.82 7.43 28.80 73.17 255.69 ▇▂▂▁▁
area_gages2 0 1.00 792.62 1701.95 4.03 122.28 329.68 794.30 25791.04 ▇▁▁▁▁
area_geospa_fabric 0 1.00 808.08 1709.85 4.10 127.98 340.70 804.50 25817.78 ▇▁▁▁▁
frac_forest 0 1.00 0.64 0.37 0.00 0.28 0.81 0.97 1.00 ▃▁▁▂▇
lai_max 0 1.00 3.22 1.52 0.37 1.81 3.37 4.70 5.58 ▅▆▃▅▇
lai_diff 0 1.00 2.45 1.33 0.15 1.20 2.34 3.76 4.83 ▇▇▇▆▇
gvf_max 0 1.00 0.72 0.17 0.18 0.61 0.78 0.86 0.92 ▁▁▂▃▇
gvf_diff 0 1.00 0.32 0.15 0.03 0.19 0.32 0.46 0.65 ▃▇▅▇▁
dom_land_cover_frac 0 1.00 0.81 0.18 0.31 0.65 0.86 1.00 1.00 ▁▂▃▃▇
root_depth_50 24 0.96 0.18 0.03 0.12 0.17 0.18 0.19 0.25 ▃▃▇▂▂
root_depth_99 24 0.96 1.83 0.30 1.50 1.52 1.80 2.00 3.10 ▇▃▂▁▁
# Visualize missing data
vis_miss(camels_data)

# Drop variables with more than 20% missing data
camels_clean <- camels_data %>%
  select(where(~ mean(!is.na(.)) > 0.8)) %>%
  drop_na(q_mean)  # Ensure response variable is present

# Look at correlation of numeric variables
numeric_vars <- camels_clean %>% select(where(is.numeric))

# Plot correlation matrix
cor_matrix <- cor(numeric_vars, use = "complete.obs")
ggcorrplot(cor_matrix, lab = TRUE, lab_size = 2.5)

# Set a seed for reproducibility
set.seed(123)

# Split the cleaned dataset: 80% training, 20% testing
camels_split <- initial_split(camels_clean, prop = 0.8)

# Extract training and testing sets
camels_train <- training(camels_split)
camels_test <- testing(camels_split)

# Optional: check dimensions
dim(camels_train)
[1] 536  57
dim(camels_test)
[1] 134  57
library(recipes)

recipe_data <- recipe(~ ., data = camels_data) %>%
  step_zv(all_predictors()) %>%  # Remove zero variance predictors
  step_novel(all_nominal()) %>%  # Handle new levels in categorical variables
  step_unknown(all_nominal()) %>%  # Handle unknown levels in categorical variables
  step_corr(all_predictors(), threshold = 0.9) %>%  # Remove highly correlated predictors
  step_impute_median(all_numeric()) %>%  # Impute missing numeric values using median
  step_impute_mode(all_nominal()) %>%  # Impute missing nominal values using mode
  step_dummy(all_nominal(), -all_outcomes()) %>%  # Create dummy variables for nominal features
  step_normalize(all_numeric())  # Normalize numeric predictors
library(tidymodels)

# Clean the training data by removing rows with missing or infinite q_mean
camels_train_cleaned <- camels_train %>%
  filter(!is.na(q_mean) & !is.infinite(q_mean) & q_mean != -Inf & q_mean != Inf) %>%
  drop_na()  # Drop any other rows with missing values

library(recipes)

camels_recipe <- recipe(runoff_ratio ~ ., data = camels_train) %>%
  step_zv() %>%  # Remove predictors with zero variance
  step_novel(all_nominal_predictors()) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Create 10-fold cross-validation folds
camels_folds <- vfold_cv(camels_train_cleaned, v = 10)

# Define model specifications without tuning
lin_reg_spec <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Random Forest with fixed parameters
rf_spec <- rand_forest(mtry = 3, trees = 500) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# XGBoost with fixed parameters
xgb_spec <- boost_tree(mtry = 3, trees = 500, learn_rate = 0.1) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Create the workflow set
workflow_set <- workflow_set(
  preproc = list(camels = camels_recipe),
  models = list(
    lin_reg = lin_reg_spec,
    random_forest = rf_spec,
    xgboost = xgb_spec
  )
)

# Fit the workflows with cross-validation and evaluate using RMSE and R-squared
results <- workflow_map(
  workflow_set,
  resamples = camels_folds,
  metrics = metric_set(rmse, rsq),
  verbose = TRUE
)
i   No tuning parameters. `fit_resamples()` will be attempted
i 1 of 3 resampling: camels_lin_reg
→ A | warning: !  The following columns have zero variance so scaling cannot be used:
                 gauge_id_new, gauge_id_unknown, high_prec_timing_new,
                 high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
                 geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
                 dom_land_cover_unknown.
               ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
                 before normalizing.
There were issues with some computations   A: x1
→ B | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
There were issues with some computations   A: x1
There were issues with some computations   A: x2   B: x1
There were issues with some computations   A: x10   B: x10

✔ 1 of 3 resampling: camels_lin_reg (2.8s)
i   No tuning parameters. `fit_resamples()` will be attempted
i 2 of 3 resampling: camels_random_forest
→ A | warning: !  The following columns have zero variance so scaling cannot be used:
                 gauge_id_new, gauge_id_unknown, high_prec_timing_new,
                 high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
                 geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
                 dom_land_cover_unknown.
               ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
                 before normalizing.
There were issues with some computations   A: x3
There were issues with some computations   A: x10

✔ 2 of 3 resampling: camels_random_forest (2.2s)
i   No tuning parameters. `fit_resamples()` will be attempted
i 3 of 3 resampling: camels_xgboost
→ A | warning: !  The following columns have zero variance so scaling cannot be used:
                 gauge_id_new, gauge_id_unknown, high_prec_timing_new,
                 high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
                 geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
                 dom_land_cover_unknown.
               ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
                 before normalizing.
There were issues with some computations   A: x4
There were issues with some computations   A: x10

✔ 3 of 3 resampling: camels_xgboost (3.7s)
# Visualize resampling results
autoplot(results)

Model Selection Based on the visualized metrics, select a model that you think best performs. Describe the reason for your choice using the metrics.

I chose Boosted Trees because it has the lowest errors with cross-validation and has the highest R-squared.

Describe the model you selected. What is the model type, engine, and mode. Why do you think it is performing well for this problem?

Model Type: Boosted Trees Engine: xgboost Mode: regression It is performing well because XGBoost is good for non-linear relationships, interactions between predictors, dealing with missing values, and avoiding over fitting.

library(tidymodels)

# Tunable XGBoost model specification
xgb_tune_spec <- boost_tree(
  trees = 500,                        # fixed number of trees
  mtry = tune(),                      # tunable: number of variables randomly selected
  learn_rate = tune(),               # tunable: step size shrinkage
  tree_depth = tune()                # tunable: max depth of trees
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
library(workflows)

# Create a workflow object combining the recipe and the tunable XGBoost model
xgb_workflow <- workflow() %>%
  add_model(xgb_tune_spec) %>%
  add_recipe(camels_recipe)
# Load required library
library(tune)

# Extract tunable parameters and their ranges
dials <- extract_parameter_set_dials(xgb_workflow)

# View the parameter objects and their ranges
dials$object
[[1]]
# Randomly Selected Predictors (quantitative)
Range: [1, ?]

[[2]]
Tree Depth (quantitative)
Range: [1, 15]

[[3]]
Learning Rate (quantitative)
Transformer: log-10 [1e-100, Inf]
Range (transformed scale): [-3, -0.5]
# Load required libraries
library(tidymodels)
library(dials)

tune_spec <- rand_forest(
  mtry = tune(),         # Number of predictors to sample
  min_n = tune(),        # Minimum number of observations in a node
  trees = 500            # Fixed number of trees
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

wf_tune <- workflow() %>%
  add_recipe(camels_recipe) %>%
  add_model(tune_spec)

# Ensure the k-fold cross-validation folds are defined (e.g., camels_folds)
camels_folds <- vfold_cv(camels_train_cleaned, v = 5)  # Example with 5-fold cross-validation

# Extract tunable parameters from the workflow
dials <- extract_parameter_set_dials(wf_tune)

# Optional: If you're using mtry (which depends on number of predictors), finalize it:
dials <- finalize(dials, camels_train_cleaned)

# Now create the grid
my.grid <- grid_space_filling(
  dials,   # No need for $parameters
  size = 25
)

# Tune the model using grid search
model_params <- tune_grid(
  wf_tune,
  resamples = camels_folds,  # Ensure camels_folds is defined as your k-folds
  grid = my.grid,  # Grid of hyperparameters
  metrics = metric_set(rmse, rsq, mae),  # Define evaluation metrics
  control = control_grid(save_pred = TRUE)  # Save predictions
)
→ A | warning: !  The following columns have zero variance so scaling cannot be used:
                 gauge_id_new, gauge_id_unknown, high_prec_timing_new,
                 high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
                 geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
                 dom_land_cover_unknown.
               ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
                 before normalizing.
There were issues with some computations   A: x1
There were issues with some computations   A: x2
There were issues with some computations   A: x3
There were issues with some computations   A: x4
There were issues with some computations   A: x5
There were issues with some computations   A: x5
# Visualize the tuning results
autoplot(model_params)

collect_metrics(model_params)
# A tibble: 75 × 8
    mtry min_n .metric .estimator   mean     n std_err .config              
   <int> <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
 1     1    24 mae     standard   0.165      5 0.00943 Preprocessor1_Model01
 2     1    24 rmse    standard   0.217      5 0.0128  Preprocessor1_Model01
 3     1    24 rsq     standard   0.719      5 0.0241  Preprocessor1_Model01
 4     3    13 mae     standard   0.124      5 0.00863 Preprocessor1_Model02
 5     3    13 rmse    standard   0.170      5 0.0126  Preprocessor1_Model02
 6     3    13 rsq     standard   0.757      5 0.0343  Preprocessor1_Model02
 7     5    33 mae     standard   0.100      5 0.00685 Preprocessor1_Model03
 8     5    33 rmse    standard   0.141      5 0.0111  Preprocessor1_Model03
 9     5    33 rsq     standard   0.787      5 0.0284  Preprocessor1_Model03
10     8     5 mae     standard   0.0792     5 0.00557 Preprocessor1_Model04
# ℹ 65 more rows
library(dplyr)

# View metrics ordered by lowest MAE (Mean Absolute Error)
collect_metrics(model_params) %>%
  filter(.metric == "mae") %>%
  arrange(mean)
# A tibble: 25 × 8
    mtry min_n .metric .estimator   mean     n std_err .config              
   <int> <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
 1    50     6 mae     standard   0.0482     5 0.00524 Preprocessor1_Model22
 2    54    14 mae     standard   0.0490     5 0.00532 Preprocessor1_Model24
 3    38     2 mae     standard   0.0507     5 0.00529 Preprocessor1_Model17
 4    52    22 mae     standard   0.0509     5 0.00509 Preprocessor1_Model23
 5    40     9 mae     standard   0.0512     5 0.00527 Preprocessor1_Model18
 6    57    32 mae     standard   0.0515     5 0.00524 Preprocessor1_Model25
 7    43    17 mae     standard   0.0523     5 0.00507 Preprocessor1_Model19
 8    45    28 mae     standard   0.0539     5 0.00534 Preprocessor1_Model20
 9    29     8 mae     standard   0.0549     5 0.00541 Preprocessor1_Model13
10    47    38 mae     standard   0.0557     5 0.00535 Preprocessor1_Model21
# ℹ 15 more rows
show_best(model_params, metric = "mae")
# A tibble: 5 × 8
   mtry min_n .metric .estimator   mean     n std_err .config              
  <int> <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
1    50     6 mae     standard   0.0482     5 0.00524 Preprocessor1_Model22
2    54    14 mae     standard   0.0490     5 0.00532 Preprocessor1_Model24
3    38     2 mae     standard   0.0507     5 0.00529 Preprocessor1_Model17
4    52    22 mae     standard   0.0509     5 0.00509 Preprocessor1_Model23
5    40     9 mae     standard   0.0512     5 0.00527 Preprocessor1_Model18
hp_best <- select_best(model_params, metric = "mae")
final_wf <- finalize_workflow(
  wf_tune,
  hp_best
)
final_fit <- last_fit(
  final_wf,        # your finalized workflow
  split = camels_split  # original data split from initial_split()
)
→ A | warning: !  The following columns have zero variance so scaling cannot be used:
                 gauge_id_new, gauge_id_unknown, high_prec_timing_new,
                 high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
                 geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
                 dom_land_cover_unknown.
               ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
                 before normalizing.
metrics_test <- collect_metrics(final_fit)
print(metrics_test)
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      0.0622 Preprocessor1_Model1
2 rsq     standard      0.921  Preprocessor1_Model1
predictions_test <- collect_predictions(final_fit)
library(ggplot2)

# Create scatter plot with actual vs predicted values
ggplot(predictions_test, aes(x = runoff_ratio, y = .pred)) +
  geom_point(aes(color = runoff_ratio), alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  scale_color_viridis_c() +
  labs(
    title = "Predicted vs Actual Values",
    x = "Actual Values (Runoff Ratio)",
    y = "Predicted Values"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

camels_data_clean <- camels_data %>%
  filter(!is.na(runoff_ratio))
# Fit the finalized workflow to the full cleaned data
final_fit_full <- fit(final_wf, data = camels_data_clean)
Warning: !  The following columns have zero variance so scaling cannot be used:
  gauge_id_new, gauge_id_unknown, high_prec_timing_new,
  high_prec_timing_unknown, low_prec_timing_new, low_prec_timing_unknown,
  geol_1st_class_new, geol_1st_class_unknown, dom_land_cover_new, and
  dom_land_cover_unknown.
ℹ Consider using ?step_zv (`?recipes::step_zv()`) to remove those columns
  before normalizing.
library(broom)
augmented_preds <- augment(final_fit_full, new_data = camels_data_clean)
library(dplyr)
augmented_preds <- augmented_preds %>%
  mutate(residual = (.pred - runoff_ratio)^2)  # or .pred - runoff_ratio for raw residuals
library(ggplot2)
library(patchwork)

# Prediction map
pred_map <- ggplot(augmented_preds, aes(x = gauge_lon, y = gauge_lat, color = .pred)) +
  geom_point(size = 2) +
  scale_color_viridis_c(option = "C") +
  labs(title = "Predicted Runoff Ratio", color = "Prediction") +
  theme_minimal()

# Residual map
resid_map <- ggplot(augmented_preds, aes(x = gauge_lon, y = gauge_lat, color = residual)) +
  geom_point(size = 2) +
  scale_color_viridis_c(option = "A") +
  labs(title = "Residuals (Squared)", color = "Residual") +
  theme_minimal()

# Combine with patchwork
pred_map + resid_map